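Some people have reported problems with the values in the `net_value` column of the CEAP datasets: they do not seem to match what the column should contain. Below, `net_value` is compared against `document_value` minus `remark_value` to see how often, and by how much, the two disagree.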
import pandas as pd
import numpy as np
# Input files for this analysis; the names suggest one file per period
# (current year, last year, previous years).
filenames = ['../data/2016-08-08-current-year.xz',
             '../data/2016-08-08-last-year.xz',
             '../data/2016-08-08-previous-years.xz']

# Identifier-like columns are read as text instead of letting pandas infer a
# numeric type. The builtin `str` is used because the `np.str` alias was
# deprecated and later removed from NumPy.
text_dtypes = {'document_id': str,
               'congressperson_id': str,
               'congressperson_document': str,
               'term_id': str,
               'cnpj_cpf': str,
               'reimbursement_number': str}

# Read every file first and concatenate once at the end: growing a DataFrame
# inside the loop copies all previously loaded rows on each iteration.
frames = []
for filename in filenames:
    # Column at position 16 is parsed as a date
    # (presumably `issue_date` -- TODO confirm against the file header).
    data = pd.read_csv(filename,
                       parse_dates=[16],
                       dtype=text_dtypes)
    frames.append(data)
dataset = pd.concat(frames)

# Values that cannot be parsed as dates become NaT instead of raising.
dataset['issue_date'] = pd.to_datetime(dataset['issue_date'], errors='coerce')
# Inspect the rows that have no `document_value`:
# how many there are, what they look like, and the first one in detail.
rows_without_document_value = dataset[dataset['document_value'].isnull()]
len(rows_without_document_value)
rows_without_document_value
rows_without_document_value.iloc[0]
import math
# Rows without a document value cannot be checked, so they are discarded.
dataset = dataset.dropna(subset=['document_value'])

# Express each value column as an integer number of hundredths (cents, assuming
# these are currency amounts) so that the comparison below is exact.
# The product is rounded to the nearest integer instead of going through
# math.ceil: binary floating point can land slightly above the true value
# (1.1 * 100. == 110.00000000000001), which ceil would turn into 111.
# The builtin `int` replaces the `np.int` alias, which was removed from NumPy.
for column in ('document_value', 'remark_value', 'net_value'):
    dataset[column + '_int'] = (dataset[column] * 100.).round().astype(int)

# Net value as it would be if it were the document value minus the remark value.
dataset['calc_net_value_int'] = dataset['document_value_int'] - dataset['remark_value_int']

# Number of rows whose recorded net value differs from the calculated one.
((dataset['calc_net_value_int'] - dataset['net_value_int']) != 0).sum()
# Look at one full record for reference.
dataset.iloc[0]

# Signed gap, in hundredths, between the calculated and the recorded net value.
dataset['diff_net_value'] = dataset['calc_net_value_int'].sub(dataset['net_value_int'])

# Summary statistics of the gap, restricted to rows where it is not zero.
has_difference = dataset['diff_net_value'].ne(0)
dataset.loc[has_difference, 'diff_net_value'].describe()

# Keep only rows whose gap is larger than 2 hundredths in absolute value.
is_significant = dataset['diff_net_value'].abs().gt(2)
with_significant_difference = dataset.loc[is_significant]
with_significant_difference['subquota_description'].describe()
from altair import *
# Bar chart counting the rows with a significant difference per subquota
# description; the x axis is sorted by that count, in descending order.
subquota_axis = X('subquota_description:O',
                  sort=SortField(field='subquota_description',
                                 order='descending',
                                 op='count'))
Chart(with_significant_difference).mark_bar().encode(
    x=subquota_axis,
    y='count(*):Q',
)

# Look at the first row with a significant difference in detail.
with_significant_difference.iloc[0]